Identifying duplicate datasets in OBIS
This notebook tries to identify duplicate datasets in the OBIS system. This is done by aggregating datasets by geohash, species, and year, and calculating cosine similarities between datasets. Skip to results here.
How to use this notebook
Delete the vectors directory to calculate vectors based on the latest data.
Dependencies
library(tidyr)
library(glue)
library(Matrix)
library(dplyr)
library(arrow)
library(dplyr)
library(geohashTools)
library(data.table)
library(reactable)

Fetch occurrences from S3
This fetches just the necessary columns from the OBIS export on S3.
# Anonymous, read-only S3 connection to the OBIS DigitalOcean Space
# that hosts the full occurrence exports.
space <- S3FileSystem$create(
  endpoint_override = "ams3.digitaloceanspaces.com",
  scheme = "https",
  anonymous = TRUE
)
# Open the parquet export lazily, then materialize only the columns
# needed for the aggregation below.
ds <- open_dataset(space$path("obis-datasets/exports/obis_20220208.parquet"))
occ <- ds %>%
  select(dataset_id, decimalLongitude, decimalLatitude, AphiaID, year = date_year) %>%
  as_tibble()

Aggregate datasets by geohash, species, and year
# Record counts per dataset per cell, where a cell is a
# (geohash, AphiaID, year) combination. The factor codes of `cell`
# later serve as positions in the per-dataset vectors.
stats <- occ %>%
  # Keep only records with a year and coordinates inside the valid
  # lat/lon domain. The original filter checked only the upper bounds,
  # so e.g. a latitude of -999 would reach gh_encode() and error.
  filter(
    !is.na(year),
    decimalLatitude >= -90, decimalLatitude < 90,
    decimalLongitude >= -180, decimalLongitude < 180
  ) %>%
  mutate(geohash = gh_encode(decimalLatitude, decimalLongitude, 2)) %>%
  mutate(cell = factor(paste(geohash, AphiaID, year, sep = "_"))) %>%
  group_by(dataset_id, cell) %>%
  # .groups = "drop": return an ungrouped tibble instead of leaving a
  # residual dataset_id grouping behind.
  summarize(records = n(), .groups = "drop")

Generate vectors and calculate similarity
First generate vectors by dataset and store as sparseVector.
# One sparse vector per dataset: position = cell factor code,
# value = record count in that cell.
n_cells <- length(levels(stats$cell))
dataset_ids <- unique(stats$dataset_id)
vectors <- list()
for (id in dataset_ids) {
  message(id)
  dataset <- stats %>%
    filter(dataset_id == id)
  # Build the sparse vector directly from the factor codes instead of
  # filling a dense length-n_cells vector element by element first:
  # avoids allocating a full dense vector per dataset. Cell indices are
  # unique within a dataset because of the group_by above.
  vectors[[id]] <- sparseVector(
    x = dataset$records,
    i = as.integer(dataset$cell),
    length = n_cells
  )
}

Calculate similarities using parallelization and write the results to a text file.
# Cosine similarity for every unordered pair of datasets (upper
# triangle only). Two fixes over the naive version:
#  - seq_len(n - 1) avoids the out-of-bounds index the original hit at
#    i == n, where (i + 1):n counts DOWN and yields j = n + 1
#    (vectors[[NA]] error);
#  - workers return their lines instead of appending to the file
#    themselves: concurrent append from forked processes can interleave
#    partial lines and corrupt the output.
n <- length(dataset_ids)
results <- parallel::mclapply(seq_len(n - 1), function(i) {
  dataset_x <- dataset_ids[i]
  x <- as.vector(vectors[[dataset_x]])
  vapply((i + 1):n, function(j) {
    dataset_y <- dataset_ids[j]
    y <- as.vector(vectors[[dataset_y]])
    similarity <- coop::cosine(x, y)
    paste(dataset_x, dataset_y, format(similarity, scientific = FALSE))
  }, character(1))
}, mc.cores = 6)
# Single writer: header first, then all pairs, in deterministic order.
writeLines(c("x y similarity", unlist(results)), "similarity.txt")

Plot the similarities to get an idea of the distribution.
# Read the pairwise similarities back and eyeball the distribution.
similarity <- fread("similarity.txt", header = TRUE, sep = " ")
plot(similarity[["similarity"]])

Create a shortlist of suspect dataset pairs.
# Dataset metadata from the OBIS API: one row per dataset with its URL,
# title, and occurrence record count.
datasets <- robis::dataset() %>%
  tidyr::unnest(statistics) %>%
  select(id, url, title, records = Occurrence)

# Pairs above the similarity cutoff, before attaching metadata.
pairs <- similarity %>%
  filter(similarity > 0.85)

# Annotate each suspect pair with the metadata of both datasets
# (suffixes _x and _y), most similar pairs first.
suspect <- pairs %>%
  left_join(datasets, by = c("x" = "id")) %>%
  left_join(datasets, by = c("y" = "id"), suffix = c("_x", "_y")) %>%
  arrange(desc(similarity)) %>%
  as_tibble()
suspect

## # A tibble: 426 × 9
## x y similarity url_x title_x records_x url_y title_y records_y
## <chr> <chr> <dbl> <chr> <chr> <int> <chr> <chr> <int>
## 1 01f35… bad2f… 1 http:/… Movemen… 143568 http:… Movemen… 2
## 2 08fb2… 40e33… 1 http:/… Nest ce… 5056 https… Nest ce… 5056
## 3 0a68b… 4b51b… 1 http:/… Microzo… 74 http:… Nationa… 74
## 4 0dd1c… 6bc62… 1 http:/… Leopard… 534 https… Leopard… 534
## 5 0e40e… 48f91… 1 https:… RMT Tra… 534 http:… RMT Tra… 547
## 6 11863… 5928a… 1 https:… Bunger … 140 http:… Bunger … 140
## 7 12abc… d7fa8… 1 http:/… Norfish… 271 http:… Norfish… 410
## 8 13347… 1e5d0… 1 http:/… Phytopl… 1098 http:… Joint O… 1098
## 9 15665… 31b19… 1 http:/… Microzo… 72 http:… Nationa… 72
## 10 15efe… 93176… 1 https:… IOMPMN1… 310 https… IOMPMN1… 310
## # … with 416 more rows
Results
# Interactive HTML table of suspect pairs: each row shows both datasets'
# titles, source URLs, and record counts, linked to their OBIS pages.
suspect %>%
  mutate(
    title = glue('<a href="https://obis.org/dataset/{x}" target="_blank">{title_x}</a><br/><br/><a href="https://obis.org/dataset/{y}" target="_blank">{title_y}</a>'),
    url = glue('<a href="{url_x}" target="_blank">{url_x}</a><br/><br/><a href="{url_y}" target="_blank">{url_y}</a>'),
    records = glue("{records_x}<br/><br/>{records_y}")
  ) %>%
  select(similarity, title, url, records) %>%
  reactable(
    columns = list(
      similarity = colDef(width = 100),
      title = colDef(html = TRUE),
      url = colDef(html = TRUE),
      records = colDef(html = TRUE, width = 100)
    ),
    pagination = FALSE
  )